library(dada2)
packageVersion("dada2") # check dada2 version
## [1] '1.21.0'
library(Biostrings)
library(ShortRead)
library(seqTools) # per base sequence content
library(phyloseq)
library(ggplot2)
library(data.table)
library(plyr)
library(dplyr)
library(qckitfastq) # per base sequence content
library(stringr)
First, we import the fastq files containing the raw reads. The samples were downloaded from the SRA database with the accession number PRJEB37924. Only sigmoid biopsy samples that were analyzed by 16S rRNA sequencing were downloaded (see 00_Metadata-Mars).
# Save the path to the directory containing the fastq zipped files
path <- "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/Mars-2020"
# list.files(path) # check we are in the right directory
# fastq filenames have format: SAMPLENAME_1.fastq.gz (FWD) / SAMPLENAME_2.fastq.gz (REV)
# Save the whole directory path to each file name
fnFs <- sort(list.files(file.path(path, "original"), pattern="_1.fastq.gz", full.names = TRUE)) # FWD reads
fnRs <- sort(list.files(file.path(path, "original"), pattern="_2.fastq.gz", full.names = TRUE)) # REV reads
show(fnFs[1:5])
show(fnRs[1:5])
# Extract sample names (run accessions), assuming filenames have format: SAMPLENAME_X.fastq.gz
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
show(sample.names[1:5]) # saves only the file name (without the path)
# Look at quality of the first few files
for (i in 1:3){ # 1:length(fnFs)
show(plotQualityProfile(fnFs[i]))
show(plotQualityProfile(fnRs[i]))
}
# Look at nb of reads per sample.
# NOTE(review): raw_stats was commented out here, but it is referenced later
# (primer-hit report) and written to disk with saveRDS — it must be created.
raw_stats <- data.frame('sample' = sample.names,
                        'reads' = seqTools::fastqq(fnFs)@nReads)
min(raw_stats$reads)
max(raw_stats$reads)
mean(raw_stats$reads)
We will have a quick peek at the per base sequence content of the reads in some samples, to make sure there is no anomaly (e.g. all reads having the same sequence).
# Look at per base sequence content of one sample's forward read (10th file).
# seqTools::fastqq summarizes the fastq; qckitfastq's read_content/plot_read_content
# turn that summary into a per-position base-composition plot.
fseqF1 <- seqTools::fastqq(fnFs[10])
## [fastqq] File ( 1/1) '/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/Mars-2020/original/ERR5169592_1.fastq.gz' done.
rcF1 <- read_content(fseqF1)
plot_read_content(rcF1) + labs(title = "Per base sequence content - Forward read")
# Same check for the matching reverse read
fseqR1 <- seqTools::fastqq(fnRs[10])
## [fastqq] File ( 1/1) '/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/Mars-2020/original/ERR5169592_2.fastq.gz' done.
rcR1 <- read_content(fseqR1)
plot_read_content(rcR1) + labs(title = "Per base sequence content - Reverse read")
Now, we will check whether the reads still contain the primers. The methods section indicates that the V4 region of the 16S rRNA gene was amplified as described in a previously published protocol (citation missing from this export). Primer sequences are shared in that paper.
# V4
FWD <- "GTGCCAGCMGCCGCGGTAA" # F515 forward primer sequence
REV <- "GGACTACHVGGGTWTCTAAT" # R806 reverse primer sequence
# Build every orientation of a primer sequence (forward, complement, reverse,
# reverse-complement) so reads can be searched for any of them.
allOrients <- function(primer) {
  require(Biostrings) # provides DNAString and the complement/reverse helpers
  # Biostrings works with DNAString objects rather than character vectors
  template <- DNAString(primer)
  variants <- c(Forward = template,
                Complement = complement(template),
                Reverse = reverse(template),
                RevComp = reverseComplement(template))
  # Convert each orientation back to a plain (named) character string
  vapply(variants, toString, character(1))
}
# Get all orientations of the two primer sequences
FWD.orients <- allOrients(FWD) # F515
REV.orients <- allOrients(REV) # R806
FWD.orients # sanity check: named character vector of 4 orientations
## Forward Complement Reverse
## "GTGCCAGCMGCCGCGGTAA" "CACGGTCGKCGGCGCCATT" "AATGGCGCCGMCGACCGTG"
## RevComp
## "TTACCGCGGCKGCTGGCAC"
REV.orients
## Forward Complement Reverse
## "GGACTACHVGGGTWTCTAAT" "CCTGATGDBCCCAWAGATTA" "TAATCTWTGGGVHCATCAGG"
## RevComp
## "ATTAGAWACCCBDGTAGTCC"
# Count the number of reads in fastq file `fn` that contain `primer`
# (IUPAC ambiguity codes honoured via fixed = FALSE; up to 2 mismatches).
primerHits <- function(primer, fn) {
  reads <- sread(readFastq(fn))
  hit_counts <- vcountPattern(primer, reads, fixed = FALSE, max.mismatch = 2)
  sum(hit_counts > 0)
}
# Get a table showing how many times the 515F and 806R primers (in every
# orientation) are found in the reads of the first three samples.
# NOTE(review): requires raw_stats from the raw-read QC section above.
for (i in 1:3){
cat("SAMPLE", sample.names[i], "with total number of", raw_stats[i,'reads'], "reads\n\n")
x <- rbind(ForwardRead.FWDPrimer = sapply(FWD.orients, primerHits, fn = fnFs[[i]]),
ForwardRead.REVPrimer = sapply(REV.orients, primerHits, fn = fnFs[[i]]),
ReverseRead.FWDPrimer = sapply(FWD.orients, primerHits, fn = fnRs[[i]]),
ReverseRead.REVPrimer = sapply(REV.orients, primerHits, fn = fnRs[[i]]))
print(x)
cat("\n____________________________________________\n\n")
}
## SAMPLE ERR5169583 with total number of 105593 reads
##
## Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer 102879 0 0 4
## ForwardRead.REVPrimer 4 0 0 96739
## ReverseRead.FWDPrimer 4 0 0 83295
## ReverseRead.REVPrimer 103571 0 0 3
##
## ____________________________________________
##
## SAMPLE ERR5169584 with total number of 129697 reads
##
## Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer 125832 0 0 13
## ForwardRead.REVPrimer 13 0 0 118601
## ReverseRead.FWDPrimer 13 0 0 103070
## ReverseRead.REVPrimer 127174 0 0 13
##
## ____________________________________________
##
## SAMPLE ERR5169585 with total number of 70485 reads
##
## Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer 68860 0 0 1
## ForwardRead.REVPrimer 1 0 0 64299
## ReverseRead.FWDPrimer 1 0 0 55897
## ReverseRead.REVPrimer 69315 0 0 1
##
## ____________________________________________
Almost all reads contain the forward & reverse primers. Ideally, to follow the standardized pipeline, we should (1) filter out reads not containing any primer and (2) trim the primers. However, the fastq files deposited on the SRA/ENA databases do not contain Illumina headers. Thus, during the merging step of paired reads later on, the Dada2 package won’t be able to match paired-reads.
We will simply perform a quality filtering of the reads, and trim the first 20 and last 30 bases of the reads (containing the primers)
Then, we perform a quality filtering of the reads.
# Place filtered files in a filtered2/ subdirectory
FWD.filt2_samples <- file.path(path, "filtered2", paste0(sample.names, "_F_filt2.fastq.gz")) # FWD reads
REV.filt2_samples <- file.path(path, "filtered2", paste0(sample.names, "_R_filt2.fastq.gz")) # REV reads
# Assign names (run accessions) for the filtered fastq.gz files
names(FWD.filt2_samples) <- sample.names
names(REV.filt2_samples) <- sample.names
# Quality-filter and trim both read directions in one pass; out2 records
# reads.in/reads.out per file
out2 <- filterAndTrim(fwd = fnFs, filt = FWD.filt2_samples,
rev = fnRs, filt.rev = REV.filt2_samples,
trimLeft = 20, trimRight=30, # trim primers (~19-20 nt at read start) and noisy read ends
maxEE=3, # reads with more than 3 expected errors (sum(10^(-Q/10))) are discarded
truncQ=10, # Truncate reads at the first instance of a quality score less than or equal to truncQ.
minLen=150, # Discard reads shorter than 150 bp.
compress=TRUE,
multithread = TRUE,
verbose=TRUE)
Let’s look at the output filtered fastq files as sanity check.
# Show how many reads went in and survived the filter for the first files
out2[1:6,] # show how many reads were filtered in each file
## reads.in reads.out
## ERR5169583_1.fastq.gz 105593 67882
## ERR5169584_1.fastq.gz 129697 84891
## ERR5169585_1.fastq.gz 70485 46101
## ERR5169586_1.fastq.gz 86720 56496
## ERR5169587_1.fastq.gz 88688 58512
## ERR5169588_1.fastq.gz 44429 21535
# Look at quality profile of the first three filtered file pairs
for (i in 1:3){
show(plotQualityProfile(FWD.filt2_samples[i]))
show(plotQualityProfile(REV.filt2_samples[i]))
}
Now we will build the parametric error model, to be able to infer amplicon sequence variants (ASVs) later on.
# Learn the parametric error model from the filtered reads
# (randomize=TRUE samples reads in random order rather than file order)
errF <- learnErrors(FWD.filt2_samples, multithread=TRUE, randomize=TRUE, verbose = 1)
errR <- learnErrors(REV.filt2_samples, multithread=TRUE, randomize=TRUE, verbose = 1)
The error rates for each possible transition (A→C, A→G, …) are shown. Points are the observed error rates for each consensus quality score. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score. Here the estimated error rates (black line) are a good fit to the observed rates (points), and the error rates drop with increased quality as expected.
# Observed vs fitted error rates per transition; nominalQ adds the red
# line expected under the nominal Q-score definition
plotErrors(errF, nominalQ = TRUE) # Forward reads
plotErrors(errR, nominalQ = TRUE) # Reverse reads
The dada() algorithm infers sequence variants based on estimated errors (previous step). Firstly, we de-replicate the reads in each sample, to reduce the computation time. De-replication is a common step in almost all modern ASV inference (or OTU picking) pipelines, but a unique feature of derepFastq is that it maintains a summary of the quality information for each dereplicated sequence in $quals.
# Dereplicate the reads in each sample (collapses identical reads; keeps a
# per-position quality summary for each unique sequence in $quals)
derepF <- derepFastq(FWD.filt2_samples) # forward
derepR <- derepFastq(REV.filt2_samples) # reverse
# Infer sequence variants using the error models learned above
dadaFs <- dada(derepF, err=errF, multithread=TRUE) # forward
dadaRs <- dada(derepR, err=errR, multithread=TRUE) # reverse
# Inspect the inferred sequence variants for the first three samples
for (i in 1:3){
print(dadaFs[[i]])
print(dadaRs[[i]])
print("________________")
}
## dada-class: object describing DADA2 denoising results
## 217 sequence variants were inferred from 13852 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 134 sequence variants were inferred from 16309 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 226 sequence variants were inferred from 16473 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 125 sequence variants were inferred from 19321 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 188 sequence variants were inferred from 9899 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 111 sequence variants were inferred from 11937 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
We now need to merge paired reads.
# Merge the denoised forward and reverse reads of each pair
mergers <- mergePairs(dadaFs, derepF, dadaRs, derepR, verbose=TRUE)
head(mergers[[1]]) # inspect merged sequences of the first sample
We can now construct an amplicon sequence variant (ASV) table, a higher-resolution version of the OTU table produced by traditional methods.
# Make the samples x ASVs sequence table from the merged sequence variants
seqtable <- makeSequenceTable(mergers)
# We should have 72 samples (72 rows)
dim(seqtable)
## [1] 72 3733
# Inspect distribution of sequence lengths
hist(nchar(getSequences(seqtable)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")
# Sequences span 515F - 806R, so expected length is ~250bp (after removing primer lengths).
# Remove any sequence variant outside 200-300bp
seqtable.new <- seqtable[,nchar(colnames(seqtable)) %in% 200:300]
dim(seqtable.new)
## [1] 72 1738
hist(nchar(getSequences(seqtable.new)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")
The core dada method corrects substitution and indel errors, but chimeras remain. Fortunately, the accuracy of sequence variants after denoising makes identifying chimeric ASVs simpler than when dealing with fuzzy OTUs. Chimeric sequences are identified if they can be exactly reconstructed by combining a left-segment and a right-segment from two more abundant “parent” sequences.
# Remove chimeric ASVs, flagged by consensus across samples
seqtable.nochim <- removeBimeraDenovo(seqtable.new, method="consensus", multithread=TRUE, verbose=TRUE)
# Check how many sequence variants we have after removing chimeras
dim(seqtable.nochim)
## [1] 72 1678
# Check how many reads we have after removing chimeras (we should keep the vast majority of the reads, like > 80%)
sum(seqtable.nochim)/sum(seqtable.new)
## [1] 0.9963381
Sanity check before assigning taxonomy.
# Function that counts nb of reads in a dada2-class object
getN <- function(x) sum(getUniques(x))
# Table tracking the number of reads surviving each step of the pipeline
# (input reads, quality-filtered reads, denoised reads, merged reads, non-chimeric reads)
track <- cbind(out2,
               sapply(dadaFs, getN),
               sapply(dadaRs, getN),
               sapply(mergers, getN),
               rowSums(seqtable.nochim),
               # Integer percentage of input reads kept at the end.
               # Vectorized as.integer() keeps `track` a plain numeric matrix;
               # the previous lapply(..., as.integer) returned a list, which
               # silently coerced the whole cbind() result to a matrix of lists.
               as.integer(rowSums(seqtable.nochim) * 100 / out2[, 1]))
# Assign column and row names
colnames(track) <- c("input", "quality-filt", "denoisedF", "denoisedR", 'merged', 'nonchim', "%input->output")
rownames(track) <- sample.names
# Show final table: for each row/sample, the initial number of reads, filtered reads, denoised reads, and non chimera reads
track
## input quality-filt denoisedF denoisedR merged nonchim
## ERR5169583 105593 67882 67514 67468 64359 46170
## ERR5169584 129697 84891 84386 84242 81354 63684
## ERR5169585 70485 46101 45855 45742 44299 36610
## ERR5169586 86720 56496 56148 56038 53806 45822
## ERR5169587 88688 58512 58086 58008 55905 38849
## ERR5169588 44429 21535 21249 21253 20536 20392
## ERR5169589 97536 61839 61359 61297 60215 27380
## ERR5169590 113298 73704 73246 73047 69120 46633
## ERR5169591 70662 45311 44986 44815 41039 39904
## ERR5169592 84848 53043 52654 52515 50040 35368
## ERR5169593 70070 45741 45278 45200 43624 33508
## ERR5169594 70514 46313 46066 45968 44669 33429
## ERR5169595 81234 51884 51578 51518 49541 45070
## ERR5169596 62654 39902 39785 39663 37516 35740
## ERR5169597 123998 82637 82288 82277 81354 47033
## ERR5169598 105876 71672 71412 71414 70545 26973
## ERR5169599 105644 72969 72674 72648 72158 6350
## ERR5169600 79270 50487 50177 50177 47739 46312
## ERR5169601 76041 47762 47346 47320 45669 37142
## ERR5169602 87122 59028 58789 58775 58113 4073
## ERR5169603 73742 48317 48114 48084 47010 37223
## ERR5169604 74329 46552 46278 46151 44192 40664
## ERR5169605 55292 36072 35901 35734 34195 29936
## ERR5169606 101878 66803 66575 66609 64616 61512
## ERR5169607 83029 52595 52324 52211 50107 43446
## ERR5169608 82800 56168 55822 55799 54485 18688
## ERR5169609 73407 48392 48111 48109 45666 42662
## ERR5169610 69347 47775 47518 47440 46274 20812
## ERR5169611 70806 45827 45462 45284 43149 36655
## ERR5169612 77124 52468 52161 52163 51271 19178
## ERR5169613 68381 43427 43265 43233 41903 38519
## ERR5169614 74287 46555 46216 46183 44623 34923
## ERR5169615 64188 41889 41656 41529 39654 34861
## ERR5169616 63924 40179 40011 39896 37615 30415
## ERR5169617 91806 63077 62795 62710 61169 19465
## ERR5169618 89209 60319 60007 60032 57906 27654
## ERR5169619 1633 701 680 633 562 562
## ERR5169620 72249 44767 44512 44430 42735 26892
## ERR5169621 86114 55549 55331 55240 52425 35200
## ERR5169622 52406 34619 34458 34469 32990 28419
## ERR5169623 97958 61968 61563 61422 59478 41553
## ERR5169624 65330 41531 41278 41111 38971 35218
## ERR5169625 55228 34342 34109 34007 32495 26377
## ERR5169626 75144 45993 45734 45536 42554 39393
## ERR5169627 65796 41340 41062 40964 38875 33949
## ERR5169628 67956 41265 40873 40627 37816 31093
## ERR5169629 72891 47614 47288 47245 45526 34920
## ERR5169630 83538 54506 54174 54242 51372 46832
## ERR5169631 67169 42595 42353 42336 41079 33441
## ERR5169632 53835 34606 34419 34322 32417 29662
## ERR5169633 52155 34003 33830 33732 32101 24020
## ERR5169634 76435 48921 48598 48472 45800 42907
## ERR5169635 53442 34376 34049 33930 31764 27063
## ERR5169636 41211 26655 26514 26453 25099 22439
## ERR5169637 46747 29274 29102 29119 27417 26349
## ERR5169638 76795 46229 45999 45990 45442 32566
## ERR5169639 71235 42595 42413 42382 40974 30186
## ERR5169640 39718 23558 23388 23360 22604 15252
## ERR5169641 68115 44439 44340 44329 43235 36168
## ERR5169642 17042 11307 11165 11140 11037 421
## ERR5169643 26208 16351 16162 16060 15450 12128
## ERR5169644 64295 40761 40620 40574 39237 32567
## ERR5169645 50906 30554 30265 30319 28395 26992
## ERR5169646 73257 47766 47571 47625 46251 31675
## ERR5169647 8075 5306 5216 5196 5124 247
## ERR5169648 7421 4659 4464 4441 4124 1955
## ERR5169649 47727 31004 30826 30859 28775 27661
## ERR5169650 75469 49539 49283 49101 46863 43825
## ERR5169651 17381 10780 10699 10526 9696 5338
## ERR5169652 14220 8886 8816 8677 8159 7661
## ERR5169653 49556 32426 32132 32058 30825 26002
## ERR5169654 83283 56457 55932 55870 54652 22463
## %input->output
## ERR5169583 43
## ERR5169584 49
## ERR5169585 51
## ERR5169586 52
## ERR5169587 43
## ERR5169588 45
## ERR5169589 28
## ERR5169590 41
## ERR5169591 56
## ERR5169592 41
## ERR5169593 47
## ERR5169594 47
## ERR5169595 55
## ERR5169596 57
## ERR5169597 37
## ERR5169598 25
## ERR5169599 6
## ERR5169600 58
## ERR5169601 48
## ERR5169602 4
## ERR5169603 50
## ERR5169604 54
## ERR5169605 54
## ERR5169606 60
## ERR5169607 52
## ERR5169608 22
## ERR5169609 58
## ERR5169610 30
## ERR5169611 51
## ERR5169612 24
## ERR5169613 56
## ERR5169614 47
## ERR5169615 54
## ERR5169616 47
## ERR5169617 21
## ERR5169618 30
## ERR5169619 34
## ERR5169620 37
## ERR5169621 40
## ERR5169622 54
## ERR5169623 42
## ERR5169624 53
## ERR5169625 47
## ERR5169626 52
## ERR5169627 51
## ERR5169628 45
## ERR5169629 47
## ERR5169630 56
## ERR5169631 49
## ERR5169632 55
## ERR5169633 46
## ERR5169634 56
## ERR5169635 50
## ERR5169636 54
## ERR5169637 56
## ERR5169638 42
## ERR5169639 42
## ERR5169640 38
## ERR5169641 53
## ERR5169642 2
## ERR5169643 46
## ERR5169644 50
## ERR5169645 53
## ERR5169646 43
## ERR5169647 3
## ERR5169648 26
## ERR5169649 57
## ERR5169650 58
## ERR5169651 30
## ERR5169652 53
## ERR5169653 52
## ERR5169654 26
Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v138.fa.gz file, and place it in the directory with the fastq files.
# Assign taxonomy down to genus level against the Silva v138 training set
taxa <- assignTaxonomy(seqtable.nochim, "~/Projects/IBS_Meta-analysis_16S/data/silva_nr_v138_train_set.fa.gz",
tryRC = TRUE, # try reverse complement of the sequences
multithread=TRUE, verbose = TRUE)
# Add species assignment by exact matching against the Silva v138 species fasta
taxa <- addSpecies(taxa, "~/Projects/IBS_Meta-analysis_16S/data/silva_species_assignment_v138.fa.gz")
# Check how the taxonomy table looks
taxa.print <- taxa
rownames(taxa.print) <- NULL # Removing sequence rownames for display only
head(taxa.print)
## Kingdom Phylum Class Order
## [1,] "Bacteria" "Firmicutes" "Clostridia" "Lachnospirales"
## [2,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales"
## [3,] "Bacteria" "Firmicutes" "Clostridia" "Lachnospirales"
## [4,] "Bacteria" "Firmicutes" "Clostridia" "Oscillospirales"
## [5,] "Bacteria" "Firmicutes" "Clostridia" "Oscillospirales"
## [6,] "Bacteria" "Firmicutes" "Clostridia" "Lachnospirales"
## Family Genus Species
## [1,] "Lachnospiraceae" "Blautia" NA
## [2,] "Bacteroidaceae" "Bacteroides" "vulgatus"
## [3,] "Lachnospiraceae" "Agathobacter" NA
## [4,] "Ruminococcaceae" "Faecalibacterium" NA
## [5,] "Ruminococcaceae" "Faecalibacterium" "prausnitzii"
## [6,] "Lachnospiraceae" "Fusicatenibacter" "saccharivorans"
table(taxa.print[,1]) # Show the different kingdoms (should be only bacteria)
##
## Archaea Bacteria Eukaryota
## 4 1560 74
# NOTE(review): `==` yields NA for rows with NA Kingdom, and NA indices also
# select rows — hence more printed values below than the 74 Eukaryota counted
# above; use which() if an exact subset is needed.
taxa.print[taxa.print[,1] == 'Eukaryota',2] # the eukaryota phyla are all NA
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [51] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [76] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [101] NA NA NA NA NA NA NA NA NA NA NA NA NA NA
table(taxa.print[,2]) # Show the different phyla
##
## Abditibacteriota Acidobacteriota Actinobacteriota Bacteroidota
## 1 3 81 202
## Campilobacterota Cyanobacteria Desulfobacterota Euryarchaeota
## 7 19 13 2
## Fibrobacterota Firmicutes Fusobacteriota Patescibacteria
## 1 1072 22 12
## Proteobacteria Spirochaetota Synergistota Thermoplasmatota
## 108 4 3 2
## Thermotogota Verrucomicrobiota
## 1 8
table(is.na(taxa.print[,2])) # is there any NA phyla?
##
## FALSE TRUE
## 1561 117
We will remove any sample with less than 500 reads from further analysis, and also any ASVs with unassigned phyla.
The preprocessing will be easier to do with ASV, taxonomic and metadata tables combined in a phyloseq object.
#_________________________
# Import metadata; drop the automatic "X" index column written by write.csv
metadata_table <- read.csv(file.path(path, "00_Metadata-Mars/modif_metadata(R).csv")) %>% select(-X)
colnames(metadata_table)
# Use the SRA run accession as row names so phyloseq can match samples
rownames(metadata_table) <- metadata_table$Run
# Add a few study-level covariates
metadata_table$author <- "Mars"
metadata_table$sequencing_tech <- "Illumina"
metadata_table$variable_region <- "V4"
# Strip everything after the first "_" (e.g. _F_filt2.fastq.gz) from the
# ASV-table row names so they match the metadata row names
# rownames(seqtable.nochim)[1:5]
rownames(seqtable.nochim) <- sub("_.*", "", rownames(seqtable.nochim))
#_________________________
# Create phyloseq object combining ASV counts, sample metadata and taxonomy
physeq <- phyloseq(otu_table(seqtable.nochim, taxa_are_rows=FALSE), # by default, in otu_table the sequence variants are in rows
sample_data(metadata_table),
tax_table(taxa))
# Remove taxa that are eukaryota, or have unassigned Phyla.
# NOTE(review): Kingdom != "Eukaryota" evaluates to NA for taxa with NA
# Kingdom, which subset_taxa also drops — confirm this is intended (those
# taxa have NA Phylum and would be removed by the next line anyway).
physeq <- subset_taxa(physeq, Kingdom != "Eukaryota")
physeq <- subset_taxa(physeq, !is.na(Phylum))
# Remove samples with less than 500 reads
physeq <- prune_samples(sample_sums(physeq)>=500, physeq)
# Save intermediate and final objects to disk.
# NOTE(review): requires raw_stats from the raw-read QC section.
saveRDS(raw_stats, file.path(path, "01_Dada2-Mars/raw_stats.rds"))
saveRDS(out2, file.path(path, "01_Dada2-Mars/out2.rds"))
saveRDS(errF, file.path(path, "01_Dada2-Mars/errF.rds"))
saveRDS(errR, file.path(path, "01_Dada2-Mars/errR.rds"))
saveRDS(dadaFs, file.path(path, "01_Dada2-Mars/infered_seq_F.rds"))
saveRDS(dadaRs, file.path(path, "01_Dada2-Mars/infered_seq_R.rds"))
saveRDS(mergers, file.path(path, "01_Dada2-Mars/mergers.rds"))
saveRDS(otu_table(physeq), file.path(path, "01_Dada2-Mars/ASVtable_final.rds")) # ASV table
saveRDS(tax_table(physeq), file.path(path, "01_Dada2-Mars/taxa_final.rds")) # taxa table
saveRDS(sample_data(physeq), file.path(path, "01_Dada2-Mars/metadata_final.rds")) # metadata table
# Taxa & Phyloseq object (shared locations for downstream cluster analyses)
saveRDS(taxa, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/CLUSTER/Taxonomy/output/taxa_mars.rds")
saveRDS(physeq, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/CLUSTER/PhyloTree/input/physeq_mars.rds")
# Absolute abundance stacked bars, coloured by phylum and split by disease status
plot_bar(physeq, fill = "Phylum")+ facet_wrap("host_disease", scales="free_x") + theme(axis.text.x = element_blank())
# Relative abundance for Phylum
phylum.table <- physeq %>%
tax_glom(taxrank = "Phylum") %>% # agglomerate at phylum level
transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
psmelt() # Melt to long format
# Stacked-bar plot of phylum-level relative abundances per sample
ggplot(phylum.table, aes(x = Sample, y = Abundance, fill = Phylum))+
facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(size = 5, angle = -90))+
labs(x = "Samples", y = "Relative abundance")